R environment set-up

Loading packages

I first loaded required packages for data munging, visualization, and analysis (these are largely Hadley Wickham libraries, plus some Bioconductor tools).

rm(list = ls())
gc()
##          used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 308428   16     592000 31.7   460000   25
## Vcells 512631    4    1023718  7.9   786237    6
# Load libraries I'll need here
library(edgeR)
library(limma)
library(biomaRt)
library(ggfortify)

# Load my go-to libraries
library(dplyr)
library(ggplot2)
library(ggthemes)
library(stringr)
library(readr)
library(readxl)
library(reshape2)

# Packages for R markdown stuff
library(knitr)
library(shiny)

 

Defining functions

Functions for plotting metrics are contained in metric_qc_functions.R.

source("R/metric_qc_functions.R")

This is a function written by Elizabeth Whalen (shared by Michael Mason) that might come in handy with some steps of the analysis. I modified the function slightly, such that library sizes are updated and normalization factors are calculated after filtering genes. I also added the option to input gene annotation information.

# Function to build DGEList object, filter genes by keeping only those having % 
# samples with at least N counts, and computes normalization from library sizes
setUpDGEList <- function(countData, geneData = NULL, 
                         filterCount = 1, filterPercentage = 0.1)
{
    d <- DGEList(counts = countData, genes = geneData)
    # d <- calcNormFactors(d) # moved further down
    
    # Filter all genes that do not have at least 'filterCount' counts per 
    # million in at least 'filterPercentage' percent of libraries
    keepRows <- rowSums(round(cpm(d$counts)) >= filterCount) >= 
        filterPercentage*ncol(countData)
    print(table(keepRows))

    curDGE <- d[keepRows, ]
    
    # James: I've added this change so that library sizes and normalization
    # factors will always be updated/calculated after filtering genes
    
    # reset library sizes
    curDGE$samples$lib.size <- colSums(curDGE$counts)
    
    # calculate normalization factors (effective library size = 
    # lib.size * norm.factor)
    curDGE <- calcNormFactors(curDGE)
    return(curDGE)
}

 

Loading data

Next, I read counts and metrics data for the project into R, along with sample annotation for project libraries.

# Read CSV file with read counts
countFile <- "data/HMMF2ADXX_combined_counts.csv"
countDat <- read_csv(countFile) # 37991 obs. of  18 variables
# str(countDat)

# Read CSV file with RNAseq/alignment metrics
metricFile <- "data/HMMF2ADXX_combined_metrics.csv"
metricDat <- read_csv(metricFile) # 16 obs. of  71 variables
# str(metricDat)

# Read XLSX file with sample annotation
designFile <- "data/JMD119 Sample Information .xlsx"
designDat <- read_excel(designFile, skip = 1) # 36 obs. of 18 variables
# str(designDat)

 

Cleaning data

I needed to do a bit of cleaning/formatting with variable names (column headers) to make life easier and avoid breaking downstream functions.

# Separate gene counts and gene symbols into separate objects, reformat
# variable names in countDat to only include libID
geneDat <- data_frame(ensemblID = countDat$geneName)
countDat <- countDat %>% 
    select(-geneName)
names(countDat) <- names(countDat) %>% 
    str_extract("lib[0-9]+")

# Reformat variable names in metrics data frame
names(metricDat) <- names(metricDat) %>% 
    str_to_lower() %>%  # change variable names to lower case
    make.unique(sep = "_") # de-dup variable names
names(metricDat)[1] <- "lib_id" # reformat libID variable name

# Reformat row names in metrics dataframe
metricDat <- metricDat %>% 
    mutate(lib_id = str_extract(lib_id, "lib[0-9]+"))

# Reformat variable names in design data frame
names(designDat) <- names(designDat) %>% 
    str_replace_all(" +", "_") %>% # replace spaces with underscores
    str_replace_all("#", "num") %>%  # replace # with 'num'
    str_replace_all("/", "_per_") %>% 
    str_replace_all("(\\(|\\))", "") %>% # remove parentheses
    str_to_lower() %>% # change to lower case
    str_replace("(?<=(lib))[a-z]+", "") %>% # replace 'library' with 'lib'
    make.unique(sep = "_") # de-dup variable names

# Remove empty rows from design data frame
designDat <- designDat %>% 
    filter(!is.na(lib_id))

 

I created a new object to store the salient information about groups in the study I want to compare.

groupDat <- designDat %>% 
    # extract knockout status (WT or BCAP) and HSC population (long or short'
    # term); combine into a single group vector
    mutate(koStatus = as.factor(tolower(str_extract(sample_name, "WT|BCAP"))),
           hscPop = as.factor(tolower(str_extract(hsc_population, 
                                                  "Long|Short"))),
           group = as.factor(str_c(koStatus, hscPop, sep = "_"))) %>% 
    select(libID = lib_id,
           koStatus, hscPop, group)

For reference, here are the relevant groups in the data (stored in groupDat):

libID koStatus hscPop group
lib7418 wt long wt_long
lib7419 wt long wt_long
lib7420 wt long wt_long
lib7421 wt long wt_long
lib7422 bcap long bcap_long
lib7423 bcap long bcap_long
lib7424 bcap long bcap_long
lib7425 bcap long bcap_long
lib7426 wt short wt_short
lib7427 wt short wt_short
lib7428 wt short wt_short
lib7429 wt short wt_short
lib7430 bcap short bcap_short
lib7431 bcap short bcap_short
lib7432 bcap short bcap_short
lib7433 bcap short bcap_short

Inspecting data

Metric plotting & QC

Next, I looked at a few standard metrics to see whether any libraries should be excluded due to quality reasons.

# Pull out and format the subset of metrics to plot
metricSummary <- metricDat %>% 
    mutate(percentDuplication = unpaired_read_duplicates / 
               unpaired_reads_examined) %>% 
    select(libID = lib_id, 
           medianCVcoverage = median_cv_coverage, 
           fastqTotalReads = fastq_total_reads, 
           percentAligned = mapped_reads_w_dups,
           percentDuplication)

Cutoffs are set by default to standard values used in the Bioinformatics Core for three metrics; libraries are considered to have ‘failed’ QC for the following conditions:

  • < 1 million total FASTQ reads
  • < 80% aligned reads
  • > 1.0 median CV coverage

I can also adjust slider bars to look at different QC cutoffs (red lines) for the x- and y-axis; dashed lines indicate outlier limits (1.5*IQR).

 

Percent aligned

Percent aligned is simply the number of FASTQ reads for which there is a corresponding alignment in the TopHat BAM file. In other words, percentAligned = # of aligned reads (+ all their duplicate reads, which were removed from the final BAM) / fastqTotalReads.

Percent aligned:
Percentage of aligned reads is well above the 80% cutoff for all libraries, with rates in the mid-90s across the board. lib7422 is outside the nominal outlier range, but still has 91.41% alignment.

Total reads:
While all libraries had well over 10 million reads in the input FASTQ file (after adapter trimming), lib7418 appears to be quite a bit smaller than average the average of 24.27 million reads, with only 13.04 million reads.

 

Median CV coverage

Median CV coverage is calculated by Picard by

  1. calculating the coeficient of variation (CV) in read coverage along the length of each of the 1000 most highly expressed transcripts;
  2. calculating the median CV across these 1000 transcripts.

A high CV of read coverage suggests possible 5’ or 3’ bias, or potentially non-uniform (“bumpy” or “spikey”) coverage of a transcript. If medianCVcoverage is high (> 1), this could indicate a more systemic problem with coverage in the dataset.

Median CV coverage:
All libraries look good (with medianCVcoverage generally close to 0.5) in terms of gene coverage among the top 1000 transcripts.

 

Examining read count data

 

Gene filtering

I used the function defined above to build the DGEList object for the data, which is the input for downstream functions.

# Filter genes with (cpm > 10) in < 10% of samples
dge = setUpDGEList(countData = countDat, geneData = geneDat,
                      filterCount = 10, 
                      filterPercentage = 0.20)
## keepRows
## FALSE  TRUE 
## 26239 11752

Keeping only those genes with >= 10 counts per million in at least 10% ( 3.2 samples), we’re left with 11752 genes.

 

Sanity check

To verify the effect of the change I made to Elizabeth’s code above, I plotted norm.factors and effective library size (lib.size.eff) under two scenarios:

  • preFilter: norm.factors are calculated before genes are filtered
  • postFilter: norm.factors are calculated after genes are filtered and library sizes are updated

For the not-too-stringent threshold used to filter genes (CPM > 10 in 20% of samples), the order of operations for calculating norm.factors appears to have minimal impact on effective library sizes.

 

Gene annotation

I used the biomaRt package to add gene symbols (from MGI) corresponding to gene IDs from Ensembl.

# Get MGI gene symbols corresponding to Ensembl Gene IDs
dgeGeneDat <- dge$genes
mart <- useMart(biomart = "ensembl", dataset = "mmusculus_gene_ensembl")
ens2Gene <- getBM(attributes = c("ensembl_gene_id", "mgi_symbol"), 
                  filters = "ensembl_gene_id", 
                  values = dgeGeneDat$ensemblID, mart = mart)
ens2Gene = ens2Gene[match(dgeGeneDat$ensemblID, ens2Gene$ensembl_gene_id), ]

# Insert MGI gene symbols for genes in DGE object gene info
dgeGeneDat <- dgeGeneDat %>% 
    mutate(mgiSymbol = ens2Gene$mgi_symbol,
           mgiSymbol = ifelse(is.na(mgiSymbol), "NA", mgiSymbol))

dge$genes <- dgeGeneDat

 

Principal component analysis (PCA)

I performed PCA with the prcomp function and plotted the first two principal components using the autoplot function from the ggfortify package.

Looking just at the long-term HSC populations, WT and BCAP knockout mice seem to cluster separately. When short-term populations are included, there is less distinction between knockout groups.

 

Possible sources of confounding

Experimental, sequencing, and alignment variables plotted against the 1st principal component

 

Looking at BCAP counts


Differential expression analysis

Building models

Design 1: expression ~ koStatus

# Define the design matrix, only including terms corresponding to KO status;
# use voom to calculate transformed expression values
koDesign <- model.matrix(~ koStatus, data = groupDat)
koVoom <- voomWithQualityWeights(dge, design = koDesign,
                                    plot = TRUE)

# Fit model for the group design
koFit <- lmFit(koVoom, koDesign)
koFit <- eBayes(koFit)
koResults <- topTable(koFit, number = nrow(dge))
koResults %>% 
    filter(mgiSymbol == "Pik3ap1")
##            ensemblID mgiSymbol logFC AveExpr   t P.Value adj.P.Val    B
## 1 ENSMUSG00000025017   Pik3ap1   1.1     5.9 2.6   0.019      0.34 -3.3

 

Design 2: expression ~ koStatus + hscPop

# Define the design matrix, including terms corresponding to KO status and HSC
# population; use voom to calculate transformed expression values
koHscDesign <- model.matrix(~ koStatus + hscPop, data = groupDat)
koHscVoom <- voomWithQualityWeights(dge, design = koHscDesign,
                                    plot = TRUE)

# Fit model for the group design
koHscFit <- lmFit(koHscVoom, koHscDesign)
koHscFit <- eBayes(koHscFit)
koHscResults <- topTable(koHscFit, coef = 2, number = nrow(dge))
koHscResults %>% 
    filter(mgiSymbol == "Pik3ap1")
##            ensemblID mgiSymbol logFC AveExpr   t P.Value adj.P.Val    B
## 1 ENSMUSG00000025017   Pik3ap1  0.97     5.9 4.4 0.00038     0.087 0.14

 

Design 3: expression ~ koStatus * hscPop (i.e., koStatus + hscPop + koStatus:hscPop)

# Define the design matrix, including terms corresponding to KO status, HSC
# population, and the interaction between the two variables; use voom to 
# calculate transformed expression values
koHscIntDesign <- model.matrix(~ koStatus * hscPop, data = groupDat)
koHscIntVoom <- voomWithQualityWeights(dge, design = koHscIntDesign,
                                    plot = TRUE)

# Fit model for the group design
koHscIntFit <- lmFit(koHscIntVoom, koHscIntDesign)
koHscIntFit <- eBayes(koHscIntFit)
koHscIntResults <- topTable(koHscIntFit, coef = 2, number = nrow(dge))
koHscIntResults %>% 
    filter(mgiSymbol == "Pik3ap1")
##            ensemblID mgiSymbol logFC AveExpr   t P.Value adj.P.Val    B
## 1 ENSMUSG00000025017   Pik3ap1   1.1     5.9 2.6    0.02      0.32 -3.2

 

Model (design) comparison

Genes with significantly different expression (adj. p-value < 0.05) as a function of BCAP KO status:


Long-term HSC mice only

# Get data for libraries from long-term HSC population
groupDatLong <- groupDat %>% 
    filter(hscPop == "long")
countDatLong <- countDat[, names(countDat) %in% groupDatLong$libID]

# Filter genes with (cpm > 10) in < 10% of samples
dgeLong = setUpDGEList(countData = countDatLong, geneData = geneDat,
                      filterCount = 10, 
                      filterPercentage = 0.20)
## keepRows
## FALSE  TRUE 
## 26152 11839

 

Gene annotation
# Get MGI gene symbols corresponding to Ensembl Gene IDs
dgeLongGeneDat <- dgeLong$genes
mart <- useMart(biomart = "ensembl", dataset = "mmusculus_gene_ensembl")
ens2Gene <- getBM(attributes = c("ensembl_gene_id", "mgi_symbol"), 
                  filters = "ensembl_gene_id", 
                  values = dgeLongGeneDat$ensemblID, mart = mart)
ens2Gene = ens2Gene[match(dgeLongGeneDat$ensemblID, ens2Gene$ensembl_gene_id), ]

# Insert MGI gene symbols for genes in DGE object gene info
dgeLongGeneDat <- dgeLongGeneDat %>% 
    mutate(mgiSymbol = ens2Gene$mgi_symbol,
           mgiSymbol = ifelse(is.na(mgiSymbol), "NA", mgiSymbol))

dgeLong$genes <- dgeLongGeneDat

 

Principal component analysis (PCA)

Looking just at the long-term HSC populations, WT and BCAP knockout mice seem to cluster separately. When short-term populations are included, there is less distinction between knockout groups.

 

Possible sources of confounding

Experimental, sequencing, and alignment variables plotted against the 1st principal component

 

Looking at BCAP counts

 

Differential expression models

Design: expression ~ koStatus

# Define the design matrix, only including terms corresponding to KO status;
# use voom to calculate transformed expression values
koDesignLong <- model.matrix(~ koStatus, data = groupDatLong)
koVoomLong <- voomWithQualityWeights(dgeLong, design = koDesignLong,
                                    plot = TRUE)

# Fit model for the group design
koFitLong <- lmFit(koVoomLong, koDesignLong)
koFitLong <- eBayes(koFitLong)
koResultsLong <- topTable(koFitLong, number = nrow(dgeLong))
koResultsLong %>% 
    filter(mgiSymbol == "Pik3ap1")
##            ensemblID mgiSymbol logFC AveExpr   t P.Value adj.P.Val    B
## 1 ENSMUSG00000025017   Pik3ap1   1.1     5.1 2.7   0.022      0.34 -3.1
koResultsLong %>% head()
##                ensemblID mgiSymbol logFC AveExpr    t  P.Value adj.P.Val
## 11177 ENSMUSG00000038418      Egr1  0.94     7.9  7.2 0.000022      0.13
## 18406 ENSMUSG00000064215     Ifi27 -0.81     7.1 -7.0 0.000030      0.13
## 4479  ENSMUSG00000024378    Stard4 -0.76     7.5 -6.3 0.000075      0.13
## 3785  ENSMUSG00000022565      Plec  0.88     7.5  5.9 0.000125      0.13
## 15467 ENSMUSG00000052369  Tmem106c  0.76     6.5  6.0 0.000119      0.13
## 34159 ENSMUSG00000089774    Slc5a3  1.47     5.9  6.1 0.000102      0.13
##         B
## 11177 3.2
## 18406 2.8
## 4479  2.0
## 3785  1.5
## 15467 1.5
## 34159 1.3

 

Short-term HSC mice only

# Get data for libraries from long-term HSC population
groupDatShort <- groupDat %>% 
    filter(hscPop == "short")
countDatShort <- countDat[, names(countDat) %in% groupDatShort$libID]

# Filter genes with (cpm > 10) in < 10% of samples
dgeShort = setUpDGEList(countData = countDatShort, geneData = geneDat,
                      filterCount = 10, 
                      filterPercentage = 0.20)
## keepRows
## FALSE  TRUE 
## 26097 11894

 

Gene annotation
# Get MGI gene symbols corresponding to Ensembl Gene IDs
dgeShortGeneDat <- dgeShort$genes
mart <- useMart(biomart = "ensembl", dataset = "mmusculus_gene_ensembl")
ens2Gene <- getBM(attributes = c("ensembl_gene_id", "mgi_symbol"), 
                  filters = "ensembl_gene_id", 
                  values = dgeShortGeneDat$ensemblID, mart = mart)
ens2Gene = ens2Gene[match(dgeShortGeneDat$ensemblID, ens2Gene$ensembl_gene_id), ]

# Insert MGI gene symbols for genes in DGE object gene info
dgeShortGeneDat <- dgeShortGeneDat %>% 
    mutate(mgiSymbol = ens2Gene$mgi_symbol,
           mgiSymbol = ifelse(is.na(mgiSymbol), "NA", mgiSymbol))

dgeShort$genes <- dgeShortGeneDat

 

Principal component analysis (PCA)

Looking just at the long-term HSC populations, WT and BCAP knockout mice seem to cluster separately. When short-term populations are included, there is less distinction between knockout groups.

 

Possible sources of confounding

Experimental, sequencing, and alignment variables plotted against the 1st principal component

 

Looking at BCAP counts

 

Differential expression models

Design: expression ~ koStatus

# Define the design matrix, only including terms corresponding to KO status;
# use voom to calculate transformed expression values
koDesignShort <- model.matrix(~ koStatus, data = groupDatShort)
koVoomShort <- voomWithQualityWeights(dgeShort, design = koDesignShort,
                                    plot = TRUE)

# Fit model for the group design
koFitShort <- lmFit(koVoomShort, koDesignShort)
koFitShort <- eBayes(koFitShort)
koResultsShort <- topTable(koFitShort, number = nrow(dgeShort))
koResultsShort %>% 
    filter(mgiSymbol == "Pik3ap1")
##            ensemblID mgiSymbol logFC AveExpr   t P.Value adj.P.Val    B
## 1 ENSMUSG00000025017   Pik3ap1  0.92     6.7 3.8  0.0038      0.66 -1.6
koResultsShort %>% head()
##                ensemblID mgiSymbol logFC AveExpr    t P.Value adj.P.Val
## 9143  ENSMUSG00000032690      Oas2 -1.34     6.5 -5.7 0.00024      0.66
## 6486  ENSMUSG00000028037     Ifi44 -0.52     7.3 -5.5 0.00030      0.66
## 4854  ENSMUSG00000025035      Arl3  1.28     5.6  5.6 0.00026      0.66
## 21595 ENSMUSG00000073643     Wdfy1 -0.47     7.7 -5.1 0.00053      0.66
## 20176 ENSMUSG00000068394    Cep152  1.35     5.7  5.3 0.00039      0.66
## 15364 ENSMUSG00000051910      Sox6 -0.88     6.9 -4.9 0.00071      0.66
##            B
## 9143   0.778
## 6486   0.711
## 4854   0.266
## 21595  0.145
## 20176  0.051
## 15364 -0.080